library(rpart)
library(tidyverse)
library(splines)
library(Rtsne)
library(RColorBrewer)
#install.packages("devtools")
#devtools::install_github("bmschmidt/wordVectors")
library(wordVectors)
library(tidytext)
# ---- Generate a noisy sine-wave sample ----
n <- 300
set.seed(1)                    # reproducible draws below
u <- sort(runif(n) * 5 * pi)   # inputs on [0, 5*pi]
y <- sin(u) + rnorm(n) / 4     # sine signal plus Gaussian noise
df <- data.frame(x = u, y = y)

# ---- Boosting hyper-parameters ----
# v = .05  (the learning rate is passed to plot_() instead)
number_of_weak_learners <- 100
number_of_knots_split <- 6
polynomial_degree <- 2

# First weak learner: a single regression tree on the raw data
fit <- rpart(y ~ x, data = df)
# Helper function to plot
# Run `number_of_weak_learners` rounds of gradient boosting with rpart trees
# and learning rate `v`, starting from the already-fitted tree `fit`, then
# plot how the strong learner's prediction evolves round by round.
#
# Args:
#   v: learning rate (shrinkage) applied to every weak learner.
#   fit: an rpart model already fitted on the global `df` (first learner).
#   number_of_weak_learners: total number of boosting rounds.
# Returns: a ggplot object (auto-printed when called at top level).
# Relies on the global data frame `df` with columns x and y.
plot_ <- function(v, fit, number_of_weak_learners) {
  yp <- predict(fit, newdata = df)
  df$yr <- df$y - v * yp       # residuals after the first learner
  YP <- v * yp                 # one column per round: shrunken predictions
  list_of_weak_learners <- list(fit)
  for (t in 2:number_of_weak_learners) {
    # Fit the next tree on the current residuals
    fit <- rpart(yr ~ x, data = df)
    yp <- predict(fit, newdata = df)
    # Update residuals and record this round's contribution
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    list_of_weak_learners[[t]] <- fit
  }
  ##############################################
  ##### Getting predictions for each boost #####
  ##############################################
  for (i in 1:number_of_weak_learners) {
    # Prediction of the strong learner built from the first i weak learners
    if (i == 1) {
      yp_i <- YP[, 1]
    } else {
      yp_i <- rowSums(YP[, 1:i])
    }
    # BUG FIX: the column is now named explicitly (yp_1, yp_2, ...).
    # Previously `col_name` was computed but unused, and repeated
    # bind_cols(yp = ...) triggered duplicate-name repair (yp...4, yp...5, ...)
    # so the "learner" index extracted below was the column position, not i.
    df[[paste0("yp_", i)]] <- yp_i
  }
  df111 <- df[, c(-2, -3)]     # drop y and yr; keep x and the yp_* columns
  # Long format: one row per (x, learner) pair
  plot_wl <- df111 %>%
    pivot_longer(cols = starts_with("yp")) %>%
    mutate(learner = str_match(name, "[0-9]+")) %>%
    mutate(learner = as.integer(ifelse(is.na(learner), 0, learner)))
  # The final strong learner uses all weak learners
  final_learner <- plot_wl %>% filter(learner == number_of_weak_learners)
  # Plot the progression of the strong learner
  ggplot() +
    # All intermediate strong learners
    geom_line(aes(x = x, y = value, group = learner, color = learner),
              data = plot_wl, alpha = 0.5) +
    # Final learner highlighted
    geom_line(aes(x = x, y = value, group = learner, color = learner),
              data = final_learner, alpha = 0.5, color = 'firebrick1', size = 2) +
    geom_point(aes(x = x, y = y), data = df) + # true values
    theme_minimal()
}
We first plot \(v=0.125\).
# Boosting progression with learning rate v = 0.125
plot_(v=0.125, fit, number_of_weak_learners)
We now plot \(v=0.01\) and \(v=0.05\).
# Same experiment with smaller learning rates for comparison
plot_(v=0.01,fit,number_of_weak_learners)
plot_(v=0.05,fit, number_of_weak_learners)
# Start a fresh experiment: clear the workspace and rebuild the data
rm(list = ls())
n <- 500
set.seed(1)
u <- sort(runif(n) * 5 * pi)
y <- sin(u) + rnorm(n) / 4
df <- data.frame(x = u, y = y)
# Hold out 50 observations as the test set
idx <- sample(seq_len(n), size = 50)
test <- df[idx, ]
remain <- df[-idx, ]
df <- remain
row.names(df) <- NULL
# From the remaining 450 rows, hold out 50 more as the validation set
valid_idx <- sample(seq_len(450), size = 50)
valid <- df[valid_idx, ]
remain <- df[-valid_idx, ]
df <- remain
row.names(df) <- NULL
# Gradient boosting with early stopping on a validation set.
#
# Starting from `model` (the first weak learner, already fitted on `df`),
# repeatedly fits rpart trees to the residuals with learning rate v = 0.05.
# Boosting stops once the improvement in validation RMSE drops below 0.01.
# Plots the progression of the strong learner and returns the list of all
# fitted weak learners. Relies on globals `df` (cols x, y) and `valid`.
fit_early_stop <- function(model) {
  # `rmse_gap` is the round-to-round change in validation RMSE
  # (renamed from `diff`, which shadowed base::diff)
  rmse_gap <- Inf
  t <- 1
  v <- .05
  yp <- predict(model, newdata = df)
  df$yr <- df$y - v * yp
  YP <- v * yp
  list_of_weak_learners <- list(model)
  y_valid_p <- v * predict(model, newdata = valid)
  y_rmse_valid <- sqrt(mean((valid$y - y_valid_p)^2))
  # Keep adding weak learners while validation RMSE improves by > 0.01
  while (rmse_gap > 0.01) {
    t <- t + 1
    fit <- rpart(yr ~ x, data = df)
    yp <- predict(fit, newdata = df)
    list_of_weak_learners[[t]] <- fit
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    y_valid_p <- y_valid_p + v * predict(fit, newdata = valid)
    y_rmse_valid_new <- sqrt(mean((valid$y - y_valid_p)^2))
    rmse_gap <- abs(y_rmse_valid_new - y_rmse_valid)
    y_rmse_valid <- y_rmse_valid_new
  }
  t <- t - 1  # the last round improved RMSE by < 0.01, so it is not counted
  ##############################################
  ##### Getting predictions for each boost #####
  ##############################################
  for (i in 1:t) {
    # Cumulative prediction of the first i weak learners
    if (i == 1) {
      yp_i <- YP[, 1]
    } else {
      yp_i <- rowSums(YP[, 1:i])
    }
    # BUG FIX: name the column explicitly (yp_1, ..., yp_t). Previously
    # `col_name` was unused and bind_cols()'s duplicate-name repair produced
    # positional names, corrupting the learner index extracted below.
    df[[paste0("yp_", i)]] <- yp_i
  }
  df111 <- df[, c(-2, -3)]  # drop y and yr; keep x and the yp_* columns
  # Long format: one row per (x, learner) pair
  plot_wl <- df111 %>%
    pivot_longer(cols = starts_with("yp")) %>%
    mutate(learner = str_match(name, "[0-9]+")) %>%
    mutate(learner = as.integer(ifelse(is.na(learner), 0, learner)))
  # The strong learner built from all t counted weak learners
  final_learner <- plot_wl %>% filter(learner == t)
  # BUG FIX: inside a function a ggplot object must be print()ed explicitly;
  # previously the plot expression's value was discarded and nothing rendered.
  print(
    ggplot() +
      geom_line(aes(x = x, y = value, group = learner, color = learner),
                data = plot_wl, alpha = 0.5) +
      geom_line(aes(x = x, y = value, group = learner, color = learner),
                data = final_learner, alpha = 0.5, color = 'firebrick1', size = 2) +
      geom_point(aes(x = x, y = y), data = df) + # true values
      ggtitle('Plot progression of learner with original learning parameter v=0.05') +
      theme_minimal()
  )
  cat('The number of trees is', t)
  return(list_of_weak_learners)
}
# Fit the first weak learner, then boost with early stopping
model=rpart(y~x,data=df)
list_of_weak_learners = fit_early_stop(model)
## The number of trees is 18
From the result above we’ve noticed that by measuring the difference between the previous RMSE and the current RMSE, we perform an early stop if the difference is less than \(0.01\), and the number of trees is \(18\).
# Evaluate the early-stopped ensemble (18 trees) on the held-out test set.
v <- 0.05
t <- 18  # tree count reported by fit_early_stop() above
for (i in seq_len(t)) {
  weak_learner_i <- list_of_weak_learners[[i]]
  if (i == 1) {
    pred <- v * predict(weak_learner_i, test)        # first weak learner
  } else {
    pred <- pred + v * predict(weak_learner_i, test) # accumulate the rest
  }
  if (i == t) {
    test <- test %>% bind_cols(yp = pred)            # attach final prediction
  }
}
# BUG FIX: RMSE is sqrt(mean(squared error)); the original computed
# sqrt(sum(squared error)) / 50, which is not an RMSE.
cat('The RMSE on the test set is', sqrt(mean((test$yp - test$y)^2)))
## The RMSE on the test set is 0.06401363
# Rebuild the data from scratch for the grid-search experiment
rm(list = ls())
n <- 500
set.seed(1)
u <- sort(runif(n) * 5 * pi)
y <- sin(u) + rnorm(n) / 4
df <- data.frame(x = u, y = y)
# 50-row test set
idx <- sample(seq_len(n), size = 50)
test <- df[idx, ]
remain <- df[-idx, ]
df <- remain
row.names(df) <- NULL
# 50-row validation set drawn from the remaining 450 rows
valid_idx <- sample(seq_len(450), size = 50)
valid <- df[valid_idx, ]
remain <- df[-valid_idx, ]
df <- remain
row.names(df) <- NULL
# Fit a 100-round boosted rpart ensemble with the given rpart.control
# hyper-parameters (learning rate v = 0.05) and return the root of the
# summed squared test-set error, sqrt(sum((pred - y)^2)).
# NOTE(review): the caller divides this value by 50 (the test-set size) to
# report an "RMSE"; a true RMSE would be sqrt(mean(...)) — confirm intent.
# Relies on globals `df` (training data) and `test`.
fit_grid_search <- function(minsplit, cp, maxdepth) {
  ctrl <- rpart.control(minsplit = minsplit, cp = cp, maxdepth = maxdepth)
  model <- rpart(y ~ x, data = df, control = ctrl)
  v <- .05
  number_of_weak_learners <- 100
  yp <- predict(model, newdata = df)
  df$yr <- df$y - v * yp
  YP <- v * yp
  list_of_weak_learners <- list(model)  # was initialised twice; once is enough
  for (t in 2:number_of_weak_learners) {
    # Fit the next tree on current residuals, same control settings
    fit <- rpart(yr ~ x, data = df, control = ctrl)
    # BUG FIX: `control` was also passed to predict(), where predict.rpart
    # silently ignores it (the tree is fixed at fit time) — removed.
    yp <- predict(fit, newdata = df)
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    list_of_weak_learners[[t]] <- fit
  }
  # Accumulate shrunken test-set predictions over all weak learners
  for (i in seq_len(number_of_weak_learners)) {
    weak_learner_i <- list_of_weak_learners[[i]]
    if (i == 1) {
      pred <- v * predict(weak_learner_i, test)
    } else {
      pred <- pred + v * predict(weak_learner_i, test)
    }
    if (i == number_of_weak_learners) {
      test <- test %>% bind_cols(yp = pred)  # local copy; global `test` unchanged
    }
  }
  sqrt(sum((test$yp - test$y)^2))
}
# Exhaustive grid search over minsplit x cp x maxdepth (3 x 3 x 3 = 27 runs)
res <- data.frame(minsplit = numeric(27), cp = numeric(27),
                  maxdepth = numeric(27), RMSE = numeric(27))
i <- 1
for (p1 in c(5, 20, 50)) {
  for (p2 in c(0.1, 0.01, 0.001)) {
    for (p3 in c(5, 10, 30)) {
      # Divide by the test-set size (50) to report an "RMSE"
      res[i, ] <- c(p1, p2, p3, fit_grid_search(p1, p2, p3) / 50)
      i <- i + 1
    }
  }
}
res
## minsplit cp maxdepth RMSE
## 1 5 0.100 5 0.06651546
## 2 5 0.100 10 0.06651546
## 3 5 0.100 30 0.06651546
## 4 5 0.010 5 0.04126301
## 5 5 0.010 10 0.04102522
## 6 5 0.010 30 0.04102522
## 7 5 0.001 5 0.03871902
## 8 5 0.001 10 0.04042144
## 9 5 0.001 30 0.04145527
## 10 20 0.100 5 0.06651546
## 11 20 0.100 10 0.06651546
## 12 20 0.100 30 0.06651546
## 13 20 0.010 5 0.04143858
## 14 20 0.010 10 0.04145343
## 15 20 0.010 30 0.04145343
## 16 20 0.001 5 0.03861811
## 17 20 0.001 10 0.03842747
## 18 20 0.001 30 0.03856957
## 19 50 0.100 5 0.06651546
## 20 50 0.100 10 0.06651546
## 21 50 0.100 30 0.06651546
## 22 50 0.010 5 0.04352319
## 23 50 0.010 10 0.04308999
## 24 50 0.010 30 0.04308999
## 25 50 0.001 5 0.04039283
## 26 50 0.001 10 0.04027163
## 27 50 0.001 30 0.04021259
# Best hyper-parameter combination.
# BUG FIX: `res$RMSE == min(res$RMSE)` can select several rows on a tie;
# which.min() always returns exactly one (the first minimum).
out <- res[which.min(res$RMSE), ]
cat("The Best Parameters are:")
## The Best Parameters are:
out
## minsplit cp maxdepth RMSE
## 17 20 0.001 10 0.03842747
# Load the 2-D kernel regression data set (columns renamed to x1, x2, y)
df = read.csv('./Part2/kernel_regression_2.csv')
colnames(df) = c('x1','x2','y')
# Gradient boosting on the 2-D data (predictors x1, x2) with learning rate
# `v`, starting from the already-fitted tree `fit`, plotting the strong
# learner's progression against x1.
#
# Args:
#   v: learning rate (shrinkage) applied to every weak learner.
#   fit: an rpart model already fitted on the global `df` (first learner).
#   number_of_weak_learners: total number of boosting rounds.
# Returns: a ggplot object (auto-printed when called at top level).
# Relies on the global data frame `df` with columns x1, x2, y.
plot_ <- function(v, fit, number_of_weak_learners) {
  yp <- predict(fit, newdata = df)
  df$yr <- df$y - v * yp       # residuals after the first learner
  YP <- v * yp                 # one column per round: shrunken predictions
  list_of_weak_learners <- list(fit)
  for (t in 2:number_of_weak_learners) {
    # Fit the next tree on the current residuals
    fit <- rpart(yr ~ x1 + x2, data = df)
    yp <- predict(fit, newdata = df)
    # Update residuals and record this round's contribution
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    list_of_weak_learners[[t]] <- fit
  }
  ##############################################
  ##### Getting predictions for each boost #####
  ##############################################
  for (i in 1:number_of_weak_learners) {
    # Prediction of the strong learner built from the first i weak learners
    if (i == 1) {
      yp_i <- YP[, 1]
    } else {
      yp_i <- rowSums(YP[, 1:i])
    }
    # BUG FIX: the column is now named explicitly (yp_1, yp_2, ...).
    # Previously `col_name` was unused and bind_cols()'s duplicate-name
    # repair produced positional names (yp...5, ...), so the "learner"
    # index extracted below was the column position, not the round i.
    df[[paste0("yp_", i)]] <- yp_i
  }
  df111 <- df[, c(-2, -3)]     # drop x2 and y; keep x1, yr and the yp_* columns
  # Long format: one row per (x1, learner) pair
  plot_wl <- df111 %>%
    pivot_longer(cols = starts_with("yp")) %>%
    mutate(learner = str_match(name, "[0-9]+")) %>%
    mutate(learner = as.integer(ifelse(is.na(learner), 0, learner)))
  # The final strong learner uses all weak learners
  final_learner <- plot_wl %>% filter(learner == number_of_weak_learners)
  # Plot the progression of the strong learner against x1
  ggplot() +
    # All intermediate strong learners
    geom_line(aes(x = x1, y = value, group = learner, color = learner),
              data = plot_wl, alpha = 0.5) +
    # Final learner highlighted
    geom_line(aes(x = x1, y = value, group = learner, color = learner),
              data = final_learner, alpha = 0.5, color = 'firebrick1', size = 2) +
    geom_point(aes(x = x1, y = y), data = df) + # true values
    theme_minimal()
}
# First weak learner on the 2-D data, then boosting at three learning rates
model = rpart(y~x1+x2,data=df)
plot_(v=0.125,fit=model,number_of_weak_learners=100)
plot_(v=0.01,fit=model,number_of_weak_learners=100)
plot_(v=0.05,fit=model,number_of_weak_learners = 100)
# Fresh experiment on the 2-D kernel regression data
rm(list = ls())
df <- read.csv('./Part2/kernel_regression_2.csv')
n <- length(df[, 1])  # number of observations
colnames(df) <- c('x1', 'x2', 'y')
# NOTE(review): no set.seed() before sample(), so this split is not
# reproducible across runs — confirm whether that is intended.
idx <- sample(seq_len(n), size = 50)
test <- df[idx, ]
remain <- df[-idx, ]
df <- remain
row.names(df) <- NULL
# 50-row validation set from the remaining rows
valid_idx <- sample(seq_len(n - 50), size = 50)
valid <- df[valid_idx, ]
remain <- df[-valid_idx, ]
df <- remain
row.names(df) <- NULL
# Gradient boosting with early stopping on a validation set (2-D data).
#
# Starting from `model` (the first weak learner, already fitted on `df`),
# repeatedly fits rpart trees to the residuals with learning rate v = 0.05.
# Boosting stops once the improvement in validation RMSE drops below 0.01.
# Plots the progression of the strong learner (against x1) and returns the
# list of weak learners. Relies on globals `df` (cols x1, x2, y) and `valid`.
fit_early_stop <- function(model) {
  # `rmse_gap` is the round-to-round change in validation RMSE
  # (renamed from `diff`, which shadowed base::diff)
  rmse_gap <- Inf
  t <- 1
  v <- .05
  yp <- predict(model, newdata = df)
  df$yr <- df$y - v * yp
  YP <- v * yp
  list_of_weak_learners <- list(model)
  y_valid_p <- v * predict(model, newdata = valid)
  y_rmse_valid <- sqrt(mean((valid$y - y_valid_p)^2))
  # Keep adding weak learners while validation RMSE improves by > 0.01
  while (rmse_gap > 0.01) {
    t <- t + 1
    fit <- rpart(yr ~ x1 + x2, data = df)
    yp <- predict(fit, newdata = df)
    list_of_weak_learners[[t]] <- fit
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    y_valid_p <- y_valid_p + v * predict(fit, newdata = valid)
    y_rmse_valid_new <- sqrt(mean((valid$y - y_valid_p)^2))
    rmse_gap <- abs(y_rmse_valid_new - y_rmse_valid)
    y_rmse_valid <- y_rmse_valid_new
  }
  t <- t - 1  # the last round improved RMSE by < 0.01, so it is not counted
  ##############################################
  ##### Getting predictions for each boost #####
  ##############################################
  for (i in 1:t) {
    # Cumulative prediction of the first i weak learners
    if (i == 1) {
      yp_i <- YP[, 1]
    } else {
      yp_i <- rowSums(YP[, 1:i])
    }
    # BUG FIX: name the column explicitly (yp_1, ..., yp_t). Previously
    # `col_name` was unused and bind_cols()'s duplicate-name repair produced
    # positional names, corrupting the learner index extracted below.
    df[[paste0("yp_", i)]] <- yp_i
  }
  df111 <- df[, c(-2, -3)]  # drop x2 and y; keep x1, yr and the yp_* columns
  # Long format: one row per (x1, learner) pair
  plot_wl <- df111 %>%
    pivot_longer(cols = starts_with("yp")) %>%
    mutate(learner = str_match(name, "[0-9]+")) %>%
    mutate(learner = as.integer(ifelse(is.na(learner), 0, learner)))
  # The strong learner built from all t counted weak learners
  final_learner <- plot_wl %>% filter(learner == t)
  # BUG FIXES: (1) inside a function a ggplot object must be print()ed, or
  # nothing renders; (2) the aesthetics referenced a nonexistent column `x`
  # — this data set has x1/x2, so plot against x1.
  print(
    ggplot() +
      geom_line(aes(x = x1, y = value, group = learner, color = learner),
                data = plot_wl, alpha = 0.5) +
      geom_line(aes(x = x1, y = value, group = learner, color = learner),
                data = final_learner, alpha = 0.5, color = 'firebrick1', size = 2) +
      geom_point(aes(x = x1, y = y), data = df) + # true values
      ggtitle('Plot progression of learner with original learning parameter v=0.05') +
      theme_minimal()
  )
  cat('The number of trees is', t)
  return(list_of_weak_learners)
}
# Fit the first weak learner on the 2-D data, then boost with early stopping
model=rpart(y~x1+x2,data=df)
list_of_weak_learners = fit_early_stop(model)
## The number of trees is 24
# Evaluate the early-stopped ensemble on the held-out test set.
v <- 0.05
# BUG FIX: the tree count was hard-coded to 18 (copied from the 1-D
# experiment) although fit_early_stop() reported 24 trees here; derive it
# from the returned ensemble instead (the list holds one extra, uncounted
# tree fitted in the final, sub-threshold round).
t <- length(list_of_weak_learners) - 1
for (i in seq_len(t)) {
  weak_learner_i <- list_of_weak_learners[[i]]
  if (i == 1) {
    pred <- v * predict(weak_learner_i, test)        # first weak learner
  } else {
    pred <- pred + v * predict(weak_learner_i, test) # accumulate the rest
  }
  if (i == t) {
    test <- test %>% bind_cols(yp = pred)            # attach final prediction
  }
}
# BUG FIX: RMSE = sqrt(mean(squared error)); the original divided
# sqrt(sum(...)) by 100 even though the test set has 50 rows.
cat('The RMSE on the test set is', sqrt(mean((test$yp - test$y)^2)))
## The RMSE on the test set is 0.03765656
# Fit a 100-round boosted rpart ensemble on the 2-D data with the given
# rpart.control hyper-parameters (learning rate v = 0.05) and return
# sqrt(sum((pred - y)^2)) / 100.
# NOTE(review): with 50 test rows a true RMSE would be sqrt(mean(...));
# the /100 scaling is kept to preserve the reported numbers (it is a
# constant factor, so the ranking of configurations is unaffected) —
# confirm intent. Relies on globals `df` and `test`.
fit_grid_search <- function(minsplit, cp, maxdepth) {
  ctrl <- rpart.control(minsplit = minsplit, cp = cp, maxdepth = maxdepth)
  model <- rpart(y ~ x1 + x2, data = df, control = ctrl)
  v <- .05
  number_of_weak_learners <- 100
  yp <- predict(model, newdata = df)
  df$yr <- df$y - v * yp
  YP <- v * yp
  list_of_weak_learners <- list(model)  # was initialised twice; once is enough
  for (t in 2:number_of_weak_learners) {
    # Fit the next tree on current residuals, same control settings
    fit <- rpart(yr ~ x1 + x2, data = df, control = ctrl)
    # BUG FIX: `control` was also passed to predict(), where predict.rpart
    # silently ignores it (the tree is fixed at fit time) — removed.
    yp <- predict(fit, newdata = df)
    df$yr <- df$yr - v * yp
    YP <- cbind(YP, v * yp)
    list_of_weak_learners[[t]] <- fit
  }
  # Accumulate shrunken test-set predictions over all weak learners
  for (i in seq_len(number_of_weak_learners)) {
    weak_learner_i <- list_of_weak_learners[[i]]
    if (i == 1) {
      pred <- v * predict(weak_learner_i, test)
    } else {
      pred <- pred + v * predict(weak_learner_i, test)
    }
    if (i == number_of_weak_learners) {
      test <- test %>% bind_cols(yp = pred)  # local copy; global `test` unchanged
    }
  }
  sqrt(sum((test$yp - test$y)^2)) / 100
}
# Exhaustive grid search for the 2-D data (3 x 3 x 3 = 27 runs)
res <- data.frame(minsplit = numeric(27), cp = numeric(27),
                  maxdepth = numeric(27), RMSE = numeric(27))
i <- 1
for (p1 in c(2, 5, 10)) {
  for (p2 in c(0.01, 0.1, 0.001)) {
    for (p3 in c(5, 10, 30)) {
      res[i, ] <- c(p1, p2, p3, fit_grid_search(p1, p2, p3))
      i <- i + 1
    }
  }
}
# Best combination; the trailing [1, ] guards against ties
out <- res[res$RMSE == min(res$RMSE), ][1, ]
cat("The Best Parameters are:")
## The Best Parameters are:
out
## minsplit cp maxdepth RMSE
## 1 2 0.01 5 0.03765656
Yes the distance matters. The closer distance between points in tSNE indicates their original dimension space are also close, since tSNE maintains the local structure of data.
Perplexity is, loosely, an estimate of the number of close neighbors each point has; it balances the algorithm's attention between local and global aspects of the data.
If the number of steps is small, the tSNE will end before it converges, and we may therefore see strange pinched shapes. The optimal number of steps depends on the data: different data sets require different numbers of iterations to converge.
We can utilize containment, one of the simplest topological properties of the plots, since reading topological information from a tSNE plot typically requires views at multiple perplexities. tSNE with a low perplexity value greatly exaggerates the size of the smaller group of points. The trefoil knot is another example, since low perplexity values fail to show global connectivity.
# Get MNIST data
# Each row is one image: column X1 is the digit label, X2..X785 are the
# 784 (28x28) pixel intensities.
mnist_raw <- read_csv("https://pjreddie.com/media/files/mnist_train.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# What is the dimension of the data set
dim(mnist_raw) # first column is the value, the rest are the pixels
## [1] 60000 785
# Rearranging the data: one row per (instance, pixel) with the pixel's
# (x, y) screen coordinates, for plotting the 28x28 digit images.
pixels_gathered <- mnist_raw %>% head(10000) %>%
  rename(label = X1) %>%
  mutate(instance = row_number()) %>%
  # MODERNIZED: pivot_longer() replaces the superseded gather()
  pivot_longer(cols = -c(label, instance), names_to = "pixel", values_to = "value") %>%
  extract(pixel, "pixel", "(\\d+)", convert = TRUE) %>%
  mutate(pixel = pixel - 2,      # column names X2..X785 -> pixel index 0..783
         x = pixel %% 28,        # column within the 28x28 grid
         y = 28 - pixel %/% 28)  # row, flipped so digits render upright
first_10k_samples = mnist_raw[1:10000,-1] #%>% as.matrix()
first_10k_samples_labels = mnist_raw[1:10000,1] %>% unlist(use.names=FALSE)
colors = brewer.pal(10, 'Spectral')  # one colour per digit class 0-9
# Visualizing the data: the first 12 digit images as 28x28 tile plots,
# faceted by (label, instance)
theme_set(theme_light())
pixels_gathered %>%
filter(instance <= 12) %>%
ggplot(aes(x, y, fill = value)) +
geom_tile() +
facet_grid(label~ instance )
##############################################
##### Visualizing the PCA decomposition #####
##############################################
# Project the 10k images onto the first two principal components
pca = princomp(first_10k_samples)$scores[,1:2]
pca_plot = tibble(x = pca[,1], y =pca[,2], labels = as.character(first_10k_samples_labels))
# Each point is drawn as its digit label, colored by class
ggplot(aes(x = x, y=y,label = labels, color = labels), data = pca_plot) + geom_text() +
xlab('PCA component 1') +ylab('PCA component 2')
##############################################
##### Running the TSNE embedding #####
##############################################
# t-SNE at perplexity 5 (theta = 0.5 Barnes-Hut approximation, learning
# rate eta = 200, 500 iterations, PCA pre-reduction to 50 dimensions).
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 5,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 6.86 seconds (sparsity = 0.002104)!
## Learning embedding...
## Iteration 50: error is 118.296618 (50 iterations in 1.50 seconds)
## Iteration 100: error is 105.842160 (50 iterations in 1.47 seconds)
## Iteration 150: error is 99.017378 (50 iterations in 1.33 seconds)
## Iteration 200: error is 96.339505 (50 iterations in 1.37 seconds)
## Iteration 250: error is 94.828513 (50 iterations in 1.41 seconds)
## Iteration 300: error is 4.354311 (50 iterations in 1.31 seconds)
## Iteration 350: error is 3.795932 (50 iterations in 1.31 seconds)
## Iteration 400: error is 3.429951 (50 iterations in 1.35 seconds)
## Iteration 450: error is 3.165247 (50 iterations in 1.38 seconds)
## Iteration 500: error is 2.961574 (50 iterations in 1.42 seconds)
## Fitting performed in 13.84 seconds.
# Record the final KL divergence (itercosts is logged every 50 of the 500
# iterations, so element 10 is the cost at the last iteration)
itercosts = c()
itercosts = c(itercosts, embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 20 (all other settings unchanged)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 20,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 8.01 seconds (sparsity = 0.008217)!
## Learning embedding...
## Iteration 50: error is 102.343945 (50 iterations in 1.50 seconds)
## Iteration 100: error is 93.474841 (50 iterations in 1.57 seconds)
## Iteration 150: error is 89.073676 (50 iterations in 1.43 seconds)
## Iteration 200: error is 88.341450 (50 iterations in 1.48 seconds)
## Iteration 250: error is 88.067594 (50 iterations in 1.58 seconds)
## Iteration 300: error is 3.413447 (50 iterations in 1.51 seconds)
## Iteration 350: error is 2.965237 (50 iterations in 1.41 seconds)
## Iteration 400: error is 2.713987 (50 iterations in 1.42 seconds)
## Iteration 450: error is 2.546516 (50 iterations in 1.42 seconds)
## Iteration 500: error is 2.424106 (50 iterations in 1.44 seconds)
## Fitting performed in 14.77 seconds.
# Final KL divergence for this perplexity
itercosts = c(itercosts, embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 60 (all other settings unchanged)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 60,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 60.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 9.82 seconds (sparsity = 0.024328)!
## Learning embedding...
## Iteration 50: error is 89.428527 (50 iterations in 2.76 seconds)
## Iteration 100: error is 84.414176 (50 iterations in 4.83 seconds)
## Iteration 150: error is 81.590132 (50 iterations in 2.29 seconds)
## Iteration 200: error is 81.447626 (50 iterations in 2.23 seconds)
## Iteration 250: error is 81.441815 (50 iterations in 2.22 seconds)
## Iteration 300: error is 2.624990 (50 iterations in 1.85 seconds)
## Iteration 350: error is 2.276628 (50 iterations in 1.79 seconds)
## Iteration 400: error is 2.099240 (50 iterations in 1.80 seconds)
## Iteration 450: error is 1.988563 (50 iterations in 1.78 seconds)
## Iteration 500: error is 1.912788 (50 iterations in 1.81 seconds)
## Fitting performed in 23.36 seconds.
# Final KL divergence for this perplexity
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 100 (all other settings unchanged)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 100,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 12.92 seconds (sparsity = 0.040571)!
## Learning embedding...
## Iteration 50: error is 83.371128 (50 iterations in 2.29 seconds)
## Iteration 100: error is 82.282542 (50 iterations in 2.33 seconds)
## Iteration 150: error is 78.375327 (50 iterations in 2.32 seconds)
## Iteration 200: error is 78.288771 (50 iterations in 2.25 seconds)
## Iteration 250: error is 78.273019 (50 iterations in 2.32 seconds)
## Iteration 300: error is 2.294505 (50 iterations in 2.14 seconds)
## Iteration 350: error is 1.991496 (50 iterations in 2.08 seconds)
## Iteration 400: error is 1.839862 (50 iterations in 2.08 seconds)
## Iteration 450: error is 1.749033 (50 iterations in 2.07 seconds)
## Iteration 500: error is 1.688850 (50 iterations in 2.09 seconds)
## Fitting performed in 21.97 seconds.
# Final KL divergence for this perplexity
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 125 (all other settings unchanged)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 125,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 125.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 16.20 seconds (sparsity = 0.050806)!
## Learning embedding...
## Iteration 50: error is 80.714217 (50 iterations in 2.66 seconds)
## Iteration 100: error is 79.919149 (50 iterations in 3.70 seconds)
## Iteration 150: error is 77.193905 (50 iterations in 3.67 seconds)
## Iteration 200: error is 76.759679 (50 iterations in 2.79 seconds)
## Iteration 250: error is 76.724228 (50 iterations in 2.56 seconds)
## Iteration 300: error is 2.144039 (50 iterations in 2.36 seconds)
## Iteration 350: error is 1.856290 (50 iterations in 2.31 seconds)
## Iteration 400: error is 1.724844 (50 iterations in 2.33 seconds)
## Iteration 450: error is 1.645857 (50 iterations in 2.34 seconds)
## Iteration 500: error is 1.593304 (50 iterations in 2.37 seconds)
## Fitting performed in 27.08 seconds.
# Final KL divergence for this perplexity
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 160 (all other settings unchanged)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 160,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 160.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 21.22 seconds (sparsity = 0.065215)!
## Learning embedding...
## Iteration 50: error is 77.767491 (50 iterations in 2.81 seconds)
## Iteration 100: error is 77.722848 (50 iterations in 4.52 seconds)
## Iteration 150: error is 75.326630 (50 iterations in 4.51 seconds)
## Iteration 200: error is 74.978892 (50 iterations in 3.68 seconds)
## Iteration 250: error is 74.924086 (50 iterations in 2.98 seconds)
## Iteration 300: error is 2.013861 (50 iterations in 2.66 seconds)
## Iteration 350: error is 1.756062 (50 iterations in 2.64 seconds)
## Iteration 400: error is 1.628438 (50 iterations in 2.62 seconds)
## Iteration 450: error is 1.552712 (50 iterations in 2.63 seconds)
## Iteration 500: error is 1.502295 (50 iterations in 2.62 seconds)
## Fitting performed in 31.67 seconds.
# Final KL divergence for this perplexity
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
As the perplexity increases, the clusters become more dispersed, and clear decision boundaries appear between the groups.
# Final KL divergence as a function of perplexity (one value per run above)
plot(c(5,20,60,100,125,160), itercosts, type='l', xlab='perplexity', ylab='KL Divergence')
Based on the curve above, the optimal perplexity among those tried is \(160\), since it attains the lowest final KL divergence.
# t-SNE at a very low perplexity (1) and low learning rate (eta = 10)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 1,
                  theta = 0.5,
                  eta = 10,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 1.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 7.70 seconds (sparsity = 0.000442)!
## Learning embedding...
## Iteration 50: error is 135.353674 (50 iterations in 1.33 seconds)
## Iteration 100: error is 135.353671 (50 iterations in 1.43 seconds)
## Iteration 150: error is 135.353312 (50 iterations in 1.53 seconds)
## Iteration 200: error is 135.176529 (50 iterations in 1.63 seconds)
## Iteration 250: error is 126.204338 (50 iterations in 1.79 seconds)
## Iteration 300: error is 6.730701 (50 iterations in 1.52 seconds)
## Iteration 350: error is 6.162342 (50 iterations in 1.74 seconds)
## Iteration 400: error is 5.783523 (50 iterations in 2.06 seconds)
## Iteration 450: error is 5.492356 (50 iterations in 2.10 seconds)
## Iteration 500: error is 5.253127 (50 iterations in 2.25 seconds)
## Fitting performed in 17.39 seconds.
# NOTE(review): this and the following eta experiments keep appending to
# `itercosts`, mixing their costs into the perplexity-vs-KL vector plotted
# earlier — harmless since that plot was already drawn, but confirm intent.
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
The plot above shows all the groups mixed together, which means the perplexity is too small for the tSNE to produce separated clusters.
# t-SNE at perplexity 160 with a low learning rate (eta = 10)
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 160,
                  theta = 0.5,
                  eta = 10,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 160.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 20.56 seconds (sparsity = 0.065215)!
## Learning embedding...
## Iteration 50: error is 77.767491 (50 iterations in 2.79 seconds)
## Iteration 100: error is 77.767491 (50 iterations in 2.80 seconds)
## Iteration 150: error is 77.767491 (50 iterations in 2.88 seconds)
## Iteration 200: error is 77.767491 (50 iterations in 3.37 seconds)
## Iteration 250: error is 77.767491 (50 iterations in 4.41 seconds)
## Iteration 300: error is 3.986898 (50 iterations in 7.59 seconds)
## Iteration 350: error is 3.103577 (50 iterations in 5.55 seconds)
## Iteration 400: error is 2.530788 (50 iterations in 3.15 seconds)
## Iteration 450: error is 2.264327 (50 iterations in 2.96 seconds)
## Iteration 500: error is 2.111666 (50 iterations in 2.87 seconds)
## Fitting performed in 38.38 seconds.
# Final KL divergence for this learning rate
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 160 with learning rate eta = 100
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 160,
                  theta = 0.5,
                  eta = 100,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 160.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 20.55 seconds (sparsity = 0.065215)!
## Learning embedding...
## Iteration 50: error is 77.767491 (50 iterations in 2.84 seconds)
## Iteration 100: error is 77.767489 (50 iterations in 3.20 seconds)
## Iteration 150: error is 76.452067 (50 iterations in 3.56 seconds)
## Iteration 200: error is 75.046796 (50 iterations in 3.37 seconds)
## Iteration 250: error is 74.932216 (50 iterations in 3.18 seconds)
## Iteration 300: error is 2.073030 (50 iterations in 2.72 seconds)
## Iteration 350: error is 1.815927 (50 iterations in 2.66 seconds)
## Iteration 400: error is 1.685659 (50 iterations in 2.68 seconds)
## Iteration 450: error is 1.606205 (50 iterations in 2.69 seconds)
## Iteration 500: error is 1.551745 (50 iterations in 2.68 seconds)
## Fitting performed in 29.60 seconds.
# Final KL divergence for this learning rate
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
# t-SNE at perplexity 160 with learning rate eta = 200
embedding = Rtsne(X = first_10k_samples, dims = 2,
                  perplexity = 160,
                  theta = 0.5,
                  eta = 200,
                  pca = TRUE, verbose = TRUE,
                  max_iter = 500)
## Performing PCA
## Read the 10000 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 160.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 10000
## Done in 20.33 seconds (sparsity = 0.065215)!
## Learning embedding...
## Iteration 50: error is 77.767491 (50 iterations in 2.97 seconds)
## Iteration 100: error is 77.715503 (50 iterations in 3.73 seconds)
## Iteration 150: error is 75.305359 (50 iterations in 3.43 seconds)
## Iteration 200: error is 75.271877 (50 iterations in 3.70 seconds)
## Iteration 250: error is 74.945991 (50 iterations in 3.20 seconds)
## Iteration 300: error is 2.029576 (50 iterations in 2.68 seconds)
## Iteration 350: error is 1.757723 (50 iterations in 2.68 seconds)
## Iteration 400: error is 1.632282 (50 iterations in 2.64 seconds)
## Iteration 450: error is 1.558068 (50 iterations in 2.64 seconds)
## Iteration 500: error is 1.510134 (50 iterations in 2.62 seconds)
## Fitting performed in 30.29 seconds.
# Final KL divergence for this learning rate
itercosts = c(itercosts,embedding$itercosts[10])
# Visualizing TSNE output
embedding_plot = tibble(x = embedding$Y[,1], y = embedding$Y[,2],
                        labels = as.character(first_10k_samples_labels))
# BUG FIX: the y-axis label contained a stray double quote
ggplot(aes(x = x, y = y, label = labels, color = labels), data = embedding_plot) +
  geom_text() + xlab('tSNE dimension 1') + ylab('tSNE dimension 2')
Based on the plots above for eta = 10, 100, and 200, a larger learning rate leads to a more dispersed clustering result.